Disc to the Future 2

home *** CD-ROM | disk | FTP | other *** search

/ Disc to the Future 2 / Disc to the Future Part II Programmer's Reference (Wayzata Technology)(6013)(1992).bin / MAC / THINKC / 4_0 / REXP_SRC / REGEXP.C < prev next >

Wrap

Text File | 1989-09-18 | 16KB | 564 lines

/* | | Regular Expression Evaluator: | | | | Greg Anderson | | 29 Kerr Hall | | Social Sciences Computing | | University of California, Santa Cruz | | sirkm@ssyx | | | | For use with HyperCard XCMDs, XFCNs, and possibly other things. | | | | Make this file part of your XCMD project. | | #include "regexp.h" in any of your files that may use routines | | from this package. | | | | The regular expressions this package matches are as follows: | | | | c Any ordinary character 'c' not listed below matches that | | character. | | | | \c A backslash (\) followed by a special character (one of | | '.', '*', '+', '[' and '\') matches the special character | | (i.e., the special meaning is removed). | | | | . A period (.) matches any single character except RETURN. | | | | [string] A non-empty string of characters enclosed in square | | brackets matches any single character found in the set. | | If the first character of such a string is ^, then | | any single character NOT in the set is matched. '^' | | loses its special meaning if it does not come first in | | the string. | | The character '-' indicates a range of characters; | | for example, [a-z] will match any lowercase letter. | | '-' loses its special meaning if it comes first (or | | after a leading '^') or last in the string. | | | | c* Any one-character regular expression followed by a * | | matches zero or more occurrences of the single character. | | If there is any choice, the longest leftmost string | | that matches is returned. | | | | c+ Like '*', but matches one or more occurrences of the | | single character regular expression. | | | | ^ A caret (^) at the beginning of an entire regular | | expression constrains that regular expression to only | | match strings found at the beginning of a line. | | | | $ A currency symbol ($) at the end of an entire regular | | expression constrains that regular expression to only | | match strings found at the end of a line. 
| | | | The following regular expressions are NOT supported: | | | | \< Beginning of word | | \> End of word | | $ ... $ "..." is treated as a regular expression | | \{n,m\} Repeated matches of previous regular expression. | | */ #include <MacTypes.h> #include <FileMgr.h> #include "regexp.h" #define TRUE 1 #define FALSE 0 #define toupper(c) ((c>='a')&&(c<='z') ? (c-('a'-'A')) : c) int regexp_flags; /*----------------------------------------------------------------- | end_of_line: | | Checks to see if the given character pointer points at the end | of a line. | | Lines end in either a return character (\r) or a null. | | If MULTILINE is true, then logical lines may be continued on | multiple physical lines if succeeding physical lines are indented. | | If FOLDEDLINE is true, then logical lines may be continued on | multiple physical lines by preceeding each return character with | a backslash. | | If NOBREAKS is true, then there are no line breaks; the entire text | field is treated as one long line. ^ matches only at the beginning | of the text field, and $ matches only at the end. | | INPUTS: line_ptr: A pointer into the line | | OUTPUTS: None save the return value. | | RETURNS: TRUE End of line reached | FALSE Not at the end of the line -----------------------------------------------------------------*/ int end_of_line(line_ptr) char *line_ptr; { if( !(*line_ptr) ) return(TRUE); if( regexp_flags & NOBREAKS ) return(FALSE); if( *line_ptr != '\r' ) return(FALSE); if( !(*(line_ptr+1)) ) return(TRUE); if( (regexp_flags & MULTILINE) && (*(line_ptr+1) <= ' ') ) return(FALSE); if( (regexp_flags & FOLDEDLINE) && (*(line_ptr-1) == '\\') ) return(FALSE); return(TRUE); } /*----------------------------------------------------------------- | find_regexp: | | Searches for occurances of 'regexp' inside of 'line'. | | 'regexp' must have had some prior processing--leading ^ and | trailing '$' should be stripped before calling. Note that | 'greplen' will do this preprocessing. 
| | INPUTS: regexp: A pointer to the regular expression | line: A pointer to the line to search | start: If zero, then 'regexp' must match 'line' | starting with the first character of 'line'. | end: If zero, then 'regexp' must also match | 'line' all the way to the end. | | OUTPUTS: start: If specified, start will be changed to | point to the first character in 'line' | that matched 'regexp'. If 'regexp' | could be matched in multiple ways | (due to wildcards), the leftmost string | is returned. | end: If specified, end will be changed to | point to the first character in 'line' | that was not part of 'regexp'. If | 'regexp' could be matched in multiple ways | (due to wildcards), the longest string | that matches is selected. | | RETURNS: TRUE 'regexp' was found in 'line' | FALSE 'regexp' not found--'start' and 'end' are | invalid. -----------------------------------------------------------------*/ int find_regexp(regexp,line,start,end) char *regexp, *line, **start, **end; { if( !start ) return( strgrep(regexp,line,end) ); while( !end_of_line(line) ) { if( strgrep(regexp,line,end) ) { *start = line; return(TRUE); } ++line; } /* | | Special case -- searching for the end of a line and nothing else. */ if( !(*regexp) && !(*end) ) { *start = line; return(TRUE); } return(FALSE); } /*----------------------------------------------------------------- | strgrep: | | Checks to see if the regular expression 'regexp' matches the | search line provided. The match must be EXACT: 'line' is not | searched for occurances of 'regexp', it is only checked to see | if 'regexp' matches 'line' starting with the first character. | ('line' may have unmatched trailing characters, however.) | | INPUTS: regexp: A pointer to the regular expression | line: A pointer to the line to search | end: If zero, then 'regexp' must also match | 'line' all the way to the end. | | OUTPUTS: end: If specified, end will be changed to | point to the first character in 'line' | that was not part of 'regexp'. 
If | 'regexp' could be matched in multiple ways | (due to wildcards), the longest string | that matches is selected. -----------------------------------------------------------------*/ int strgrep(regexp,line,end) char *regexp, *line, **end; { char *last = 0; /* | | Search over every character in the comparitor string */ while( *regexp ) { /* | | If we have reached the end of the line but there are | | still characters in the regular expression, then the | | search has probably failed. | | | | Wildcards in the regular expression can make things | | a bit trickier, though. */ if( end_of_line(line) ) { if( strcmp( regexp,"*" ) == 0 ) break; if( strcmp( regexp+1,"*" ) == 0 ) break; return(FALSE); } if( !chargrep(®exp,&line,&last) ) { /* | | The search character does not match: if the next regular | | expression is not a '*', then the search has FAILED. */ if( *regexp != '*' ) return(FALSE); else { /* | | Back up the line pointer so that the same | | character may be checked against the next | | element in the regular expression string */ last = 0; --line; ++regexp; } } } /* | | If we are searching to the END of the line, then the input | | line must be out of valid characters in order to return | | a match. */ if( !end ) return( end_of_line(line) ); *end = line; return(TRUE); } /*----------------------------------------------------------------- | chargrep: | | Compares just one character in the regular expression | | INPUTS: All inputs are pointers to pointers to strings, as | chargrep will advance these pointers after comparing | them. | | regexp: Points into the regular expression | line: Points into the line being searched | last: Points at the last character checked in | the regular expression; usually = (*regexp-1). | | OUTPUTS: regexp: Advanced to the next char in the reg exp. | line: Advanced to the next char in search line | last: Set to the initial value of 'regexp'. 
-----------------------------------------------------------------*/

/*
|  chargrep, implementation notes:
|
|  RETURNS:  non-zero if the regexp element matched, zero otherwise.
|
|  Plain characters, '.', backslash escapes and [sets] consume one
|  line character and one regexp element whether or not they match;
|  the caller (strgrep) backs up if it needs to.
|
|  A wildcard ('*' or '+') finishes the whole match in one step:
|  the rest of the regular expression is handed to wild_scan, and on
|  return 'regexp' always points at an empty string.  On success
|  'line' points at the first character after the complete match.
|
|  A wildcard with no element in front of it ('last' is zero) is
|  treated as if it followed a '.'.
|
|  A backslash that is the last character of the regular expression
|  never matches and leaves 'regexp' on the terminating null.
*/
int chargrep(regexp,line,last)
char **regexp, **line, **last;
{
    char    c = **line,         /* The line character under test     */
            *look = *regexp,    /* Where this regexp element starts  */
            *zero_end;          /* End of a zero-occurrence match    */
    int     match,
            had_element;        /* Wildcard follows a real element?  */

    switch( **regexp )
    {
        /*
        |  Set search?
        */
        case '[':
            *last = look;
            ++(*line);
            return( searchset(regexp,c) );

        /*
        |  '.' matches any single character except newline / return.
        |  Turning c into '.' makes the comparison below succeed.
        */
        case '.':
            if( (c != '\r') && (c != '\n') )
                c = '.';
            break;

        /*
        |  Wildcards:  the line is scanned for the remainder of the
        |  regular expression; if it can be found more than once the
        |  longest applicable match is used.
        */
        case '*':
        case '+':
            had_element = (*last != 0);
            if( !had_element )
                *last = ".";

            match = wild_scan(*regexp+1,line,*last);

            /*
            |  Fixup for '*'-style searches:  the character already
            |  consumed by the element in front of the '*' may really
            |  belong to the rest of the regular expression (zero
            |  occurrences).  This only makes sense when an element
            |  did consume a character; otherwise *line-1 would lie
            |  in front of the text being searched.
            |
            |  strgrep reports the first unmatched character, so its
            |  result is stored as it stands.
            */
            if( !match && (**regexp == '*') && had_element
                    && strgrep(*regexp+1,(*line-1),&zero_end) )
            {
                *line = zero_end;
                match = TRUE;
            }
            else
            {
                /* wild_scan is documented to leave 'line' on the
                |  last character matched; step past it. */
                ++(*line);
            }
            *regexp = "";
            return(match);

        /*
        |  Backslash escape: next character interpreted literally
        |
        |  Note:  Should check for \nnn (octal representation)
        */
        case '\\':
            if( *(*regexp+1) == 0 )
            {
                /* Nothing left to escape:  report a mismatch without
                |  stepping over the terminating null. */
                ++(*regexp);
                ++(*line);
                *last = look;
                return(FALSE);
            }
            ++(*regexp);
            break;

        default:
            break;
    }

    /*
    |  At this point, 'c' contains the character from the search
    |  line that must be matched in the regular expression
    |  (EXACTLY).  If c does not match, the search still will not
    |  fail if the next character in the regexp is a '*'; that
    |  decision is left to the caller.
    */
    if( regexp_flags & IGNORE )
        c = toupper(c);
    match = (**regexp == c);

    /*
    |  Set 'last' = the initial value of the regular expression ptr
    |  and advance the regexp and line pointers.
    */
    ++(*regexp);
    ++(*line);
    *last = look;
    return(match);
}

/*-----------------------------------------------------------------
|   searchset:
|
|   Compares a [list] in the regular expression with just one
|   character in the input line.
|
|   INPUTS:     regexp:     A pointer to a pointer into the regular
|                           expression
|               check_c:    The character to check.
| | Enter with a pointer to a pointer into the regular expression | Upon entry, the regexp pointer should point at the '['. | Upon exit, it will point to the character AFTER the ']'. | | RETURNS: TRUE: 'check_c' was in the set | FALSE: 'check_c' was not in the set -----------------------------------------------------------------*/ int searchset(regexp,check_c) char **regexp, check_c; { char c, /* The char from the set */ lc = 0; /* The last char from set */ int found = 0, /* Flag: found check_c? */ invert = 0; /* Flag: inverted search */ /* | | Advance past the '[' and check for a leading '^' */ ++(*regexp); c = **regexp; if( c == '^' ) { ++invert; ++(*regexp); c = **regexp; } ++(*regexp); do { if( regexp_flags & IGNORE ) c = toupper(c); if( (c == '-') && lc ) { /* | | Check if the character lies within a range */ if( (lc <= check_c) && (**regexp >= check_c) ) found = 1; lc = 0; } /* | | Check if this character in the regexp list matches the | | character being checked. */ else if( c == check_c ) found = 1; lc = c; } while( (c = *((*regexp)++) ) != ']' ); return( found ^ invert ); } /*----------------------------------------------------------------- | wild_scan: | | Regular expression wildcard handling. Searches for the last part | of a regular expression (after a wildcard) in a line. | | INPUTS: regexp: A pointer to a pointer into the regular | expression (points to the character after | the wildcard) | line: A pointer to a pointer into the line being | searched (points at the character to start | searching at) | last: A pointer to the last character in the regexp | before the wildcard. | | OUTPUTS: regexp: ALWAYS points to the null terminator at the | end of regexp. | line: points to the last character matched, if there | was a match. Otherwise unchanged. | | RETURNS: TRUE: The pattern matched; line points to the | first character not matched. | FALSE: The pattern did not match. 
-----------------------------------------------------------------*/ wild_scan(regexp,line,last) char *regexp, **line, *last; { char *scan = *line, *copy_of_last, *dummy; int result = FALSE; while( !end_of_line(scan) ) { /* | | If the last part of the regexp is matched at the current | | possition of 'scan', then remember that a match has been | | found and keep scanning. | | | | If (and only if) regexp is found, strgrep changes 'line' to | | point to the character after the last one matched by regexp. */ if( strgrep(regexp,scan,line) ) result = TRUE; /* | | If the character pointed to by scan does not match | | the regexp character before the wildcard, then | | the scan is terminated. */ copy_of_last = last; if( !chargrep(©_of_last,&scan,&dummy) ) break; } return(result); } /*----------------------------------------------------------------- | greplen: | | Finds the length of a grep search string. In the case of | strings containing wild cards, returns the MINIMUM length string | that could match the search string. | | greplen is also responsible for finding the occurance of ^ and $ | at the beginning and end of the string (respectively). If these | flags are specified, greplen notes this fact & then strips them | from the passed searchstring. | | If the grep search string is not valid, greplen returns -1. -----------------------------------------------------------------*/ int greplen(searchstring) char **searchstring; { char c, *string; int len = 0; if( regexp_flags & IGNORE ) MakeUpper(*searchstring); /* | | Does the search string begin with '^'? */ if( **searchstring == '^' ) { ++(*searchstring); regexp_flags |= BEGINFLAG; } string = *searchstring; /* | | Count the characters in the search string */ while( c = *string++ ) { switch( c ) { /* | | Since '*' might match zero characters, the length of | | the string is decremented by one, since the previous | | character does not have to be matched. 
*/ case '*': if( len ) --len; break; /* | | If a '$' is found at the end, then set the 'END' flag. | | Otherwise, count the $ as a search character. */ case '$': if( (*string) == 0 ) { *(string-1) = 0; regexp_flags |= ENDFLAG; } else ++len; break; /* | | Scan through an entire [string], counting it as only | | one character. When this loop exits, string points to | | the ']', which will be counted in the search length on | | the next pass of the while() loop. */ case '[': if( *string++ < ' ') return(-1); while( *string != ']' ) if( *string++ < ' ' ) return(-1); break; /* | | Backslash falls through to the default case, but it | | first advances past the character after the backslash */ case '\\': if( *string++ < ' ') return(-1); default: ++len; } } return(len); }